Load required packages:
library(sf) # spatial data manipulation
## Linking to GEOS 3.8.0, GDAL 2.4.2, PROJ 6.3.0
library(data.table) # fast data manipulation
library(leaflet) # easy mapping in R
library(ggplot2) # plotting
library(rpart); library(rpart.plot) # decision tree models
library(MASS); library(pscl) # Poisson and zero-inflated regression
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis
source("R/summary_plots.R") # function for creating summary plots
options(scipen = 99) # disable scientific notation
Road Segments and Traffic: A shapefile of Pennsylvania traffic data has been prepared from PennDOT’s RMSTRAFFIC data, retaining only primary state routes. The shapefile is read into R as a spatial object using st_read function from the sf package:
# Read the prepared PennDOT traffic shapefile (primary state routes only)
# into an sf spatial object; per the output below, the layer's CRS is
# EPSG:2272 (NAD83 / Pennsylvania South, US feet)
segments <- st_read("shp/traffic/traffic.shp")
## Reading layer `traffic' from data source `/Users/markegge/projects/lighting_safety_r_demo/shp/traffic/traffic.shp' using driver `ESRI Shapefile'
## Simple feature collection with 15866 features and 8 fields (with 1 geometry empty)
## geometry type: MULTILINESTRING
## dimension: XY
## bbox: xmin: 1191414 ymin: 141037.8 xmax: 2807622 ymax: 1075345
## CRS: 2272
# Assign a unique identifier to each segment. seq_len() is the safe idiom:
# 1:nrow(x) would yield c(1, 0) if the table were ever empty.
segments$id <- seq_len(nrow(segments))
Crashes: Having downloaded the PA Crash data files for 2012 – 2016 (five years is the typical period of analysis for safety investigations), the CSV files are loaded into R using the fread function from the data.table package. Compared with read.csv, fread is much faster, and has sensible defaults (e.g. stringsAsFactors = FALSE). A list object with each year’s data is populated, then the rbindlist from data.table binds together the rows from each object in the list.
# Read the five annual statewide crash CSVs (2012-2016) and stack them into
# one table. fread is fast and defaults to stringsAsFactors = FALSE;
# rbindlist binds the per-year tables row-wise in a single pass.
crashes <- rbindlist(
  lapply(as.character(2012:2016), function(yr) {
    fread(paste0("data/Statewide_", yr, "/CRASH_", yr, "_Statewide.csv"))
  })
)
The result is a dataset called crashes with 627180 observations.
We then run the provided summary_plots function to generate a PDF with a descriptive plot of each attribute in the dataset.
## Plots generated to out/plots.pdf .
Take a moment to inspect the resulting file out/plots.pdf:
Finally, filter our crashes to those occurring at night, and create a “crash_lighting” attribute, based on the metadata in docs/PA_Crashes_Data_Dictionary.pdf:
# Keep only nighttime crashes: ILLUMINATION code 2 ("Dark - No Street Lights")
# or 3 ("Dark - Street Lights"), per docs/PA_Crashes_Data_Dictionary.pdf
crashes <- crashes[ILLUMINATION %in% c(2, 3)]
# Label each remaining record; only codes 2 and 3 survive the filter,
# so a single vectorized assignment covers both cases
crashes[, crash_lighting := ifelse(ILLUMINATION == 2, "unlit", "lit")]
Next, we spatially join crashes to road segments by buffering the road segments by 50 feet and then performing a spatial join.
Note: for a spatial join to execute properly, both the source and destination layers must be in the same spatial projection. In this case, we use the EPSG:2272 NAD83 / Pennsylvania South (ft) projection, which measures distances in feet.
The st_as_sf function from the sf package creates a spatial object from the crashes dataset based on the latitude and longitude values in the “DEC_LAT” and “DEC_LONG” fields.
# Drop crash records with missing coordinates -- they cannot be geolocated
crashes <- crashes[!(is.na(DEC_LONG) | is.na(DEC_LAT))] # keep only records with x, y values
# Promote the table to an sf point layer; crs = 4326 declares the
# coordinates as WGS84 latitude/longitude
crashes <- st_as_sf(crashes, coords = c("DEC_LONG", "DEC_LAT"), crs = 4326) # create sf spatial object
crashes <- st_transform(crashes, 2272) # reproject to EPSG:2272 to match the segments layer
# buffer road segments by 50 feet (the unit is feet because EPSG:2272 is a
# feet-based projection); FLAT end caps avoid capturing crashes beyond
# segment endpoints
segment_buffer <- st_buffer(segments, dist = 50, endCapStyle = "FLAT")
# spatially join crash points onto the buffered road segments
joined <- st_join(segment_buffer, crashes, left = FALSE) # inner join: segments with no crashes are dropped
Finally, we create a new data.table of the aggregate crash counts by road segment, and merge this new attribute back to our segments data:
# Aggregate crash counts by road segment and merge them back to the
# segments layer, then compute exposure (VMT) and crash rates.
segment_crashes <- as.data.table(st_drop_geometry(joined)) # convert sf to data.table
# .N / 5 converts the 5-year total into an average annual crash count
crash_counts <- segment_crashes[, .(annual_crashes = .N / 5), by = id] # annual crashes by segment
# merge crash counts back to segments (left join: all segments kept)
segments <- merge(segments, crash_counts, by = "id", all.x = TRUE)
# Segments with no joined crashes get NA from the left join; treat as zero.
# Direct vector assignment replaces the fragile `df[cond, ]$col <- v`
# replacement-function chain, which copies the subset before assigning.
segments$annual_crashes[is.na(segments$annual_crashes)] <- 0
# million annual vehicle miles travelled (exposure)
segments$mvmt <- with(segments, (DLY_VMT * 365) / 1000000)
# annual crashes per 1m annual VMT
# NOTE(review): segments with DLY_VMT == 0 would produce Inf/NaN rates here
# -- confirm the source data excludes zero-volume segments
segments$rate <- with(segments, annual_crashes / mvmt)
Our roadway dataset does not include information about overhead lighting. The section below uses crash data to impute a “lighting” attribute for our road segments, and then uses a decision tree classifier to assign lighting to unknown segments based on known segments.
# count total crashes by segment id and illumination condition
counts <- segment_crashes[, .(crash_count = .N), by = .(id, crash_lighting)]
segment_counts <- dcast(counts, id ~ crash_lighting, # long to wide transform
value.var = "crash_count", fill = 0) # fill missing id/lighting combinations with zero
# Impute segment lighting from the majority of its nighttime crashes;
# ties go to "lit" because of the >= comparison
segment_counts[, segment_lighting := ifelse(lit >= unlit, "lit", "unlit")]
# Left-join the imputed lighting condition onto all road segments;
# segments with no nighttime crashes receive NA
light <- merge(as.data.table(st_drop_geometry(segments)), # create data.table from segments attributes
segment_counts[, .(id, segment_lighting)], # join result, keeping only segment_lighting field
by = "id", all.x = TRUE) # join by id, keeping all segments
# Table of "lit", "unlit", and NA (unknown) lighting_condition values
table(light$segment_lighting, useNA = "ifany")
##
## lit unlit <NA>
## 6081 7452 2333
For segments with unknown lighting_condition, predict which road segments are illuminated using an rpart decision tree:
# Fit a classification tree predicting the imputed lighting condition from
# route, geometry, and traffic attributes. rpart's default na.action drops
# rows where the response (segment_lighting) is NA, so the in-sample
# predictions below align with the non-NA rows extracted as `actual`.
fit <- rpart(segment_lighting ~ ST_RT_NO + CTY_CODE + DISTRICT_N + SEG_LNGTH_ + CUR_AADT + DLY_VMT, data = light)
# create a confusion matrix to evaluate classifier performance
actual <- light[!is.na(segment_lighting), segment_lighting]
# in-sample class predictions for the rows used in fitting
predicted <- predict(fit, type = "class")
(cm <- as.matrix(table(Actual = actual, Predicted = predicted))) # create the confusion matrix
## Predicted
## Actual lit unlit
## lit 4624 1457
## unlit 1539 5913
# Overall accuracy: share of segments on the confusion-matrix diagonal
accuracy <- sum(diag(cm)) / sum(cm)
# Cohen's kappa: agreement beyond chance. Chance agreement is the dot
# product of the row and column marginal proportions of the matrix.
p <- rowSums(cm) / sum(cm)
q <- colSums(cm) / sum(cm)
chance_agreement <- sum(p * q)
kappa <- (accuracy - chance_agreement) / (1 - chance_agreement) # ~0.55 indicates moderate agreement
cat("Model is", round(accuracy * 100), "% accurate with a Kappa score of", round(kappa, 2))
## Model is 78 % accurate with a Kappa score of 0.55
Our decision tree is reasonably accurate (~78%) and the Kappa score (0.55) indicates moderate evidence that our model outperforms a naïve approach of simply assigning all attributes to the majority class.
# use decision tree to fill in missing lighting values
predictions <- predict(fit, type = "class", newdata = light[is.na(segment_lighting)])
# Assign by reference with data.table's := (sub-assign to the NA rows).
# The original `light[cond]$col <- v` form assigns into a copied subset
# via a replacement-function chain; := is the supported in-place idiom.
# as.character() is explicit because predict(type = "class") returns a
# factor while segment_lighting is a character column.
light[is.na(segment_lighting), segment_lighting := as.character(predictions)]
# Updated table of "lit" and "unlit" lighting_condition values
table(light$segment_lighting, useNA = "ifany")
##
## lit unlit
## 7413 8453
# join the (now fully populated) lighting attribute back to the sf road
# segments layer, renaming segment_lighting to "lighting"
segments <- merge(segments, light[, .(id, lighting = segment_lighting)], by = "id", all.x = TRUE)
Visualize the results:
# limit map data to the Pittsburgh region (PennDOT districts 11 and 12)
# and reproject to WGS84 (EPSG:4326), which leaflet expects
leaflet_segments <- st_transform(segments[segments$DISTRICT_N %in% c("11", "12"), ], 4326)
# NOTE(review): three colors are supplied but lighting has only two levels
# ("lit"/"unlit") after imputation -- the third color goes unused
pal <- colorFactor(c("yellow", "brown", "orange"), leaflet_segments$lighting)
leaflet(data = leaflet_segments) %>%
addProviderTiles(providers$CartoDB.DarkMatter) %>% # dark basemap for contrast
addPolylines(color = ~pal(lighting)) %>%
addLegend(pal = pal, values = ~lighting)
Exploratory data analysis allows the analyst to become more familiar with the data and informs the selection of subsequent modelling or data visualization steps.
# convert spatial object to data.table for easy data manipulation
DT <- as.data.table(st_drop_geometry(segments))
# DT <- DT[!is.na(rate)] # one weird segment has no geom or rate
# plot crash counts vs. vmt
# relationship between crashes and vmt seems fairly linear
ggplot(DT, aes(x = mvmt, y = annual_crashes)) +
geom_point() + ggtitle("Crash Count vs. VMT")
Crash Rates:
# map crash rates, binning rates into 6 quantile classes for the palette
qpal <- colorQuantile(palette = "YlOrRd", domain = leaflet_segments$rate, n = 6)
leaflet(data = leaflet_segments) %>%
addProviderTiles(providers$CartoDB.DarkMatter) %>%
addPolylines(color = ~qpal(rate)) %>%
addLegend(pal = qpal, values = ~rate, opacity = 1)
# some descriptive plots of the crash-rate distribution
hist(DT$rate)
hist(log(DT$rate)) # log scale; note log(0) = -Inf for zero-crash segments
hist(DT[rate < 50]$rate, breaks = 50) # zoom in on rates below 50
Decision trees are useful tools to determine which dataset attributes are predictive of our outcome variables of interest.
# What explains / predicts crash counts? A regression decision tree
# (rpart fits a regression tree automatically for a numeric response):
fit <- rpart(annual_crashes ~ DISTRICT_N + SEG_LNGTH_ + CUR_AADT + DLY_VMT,
data = DT)
rpart.plot(fit)
# What explains / predicts crash rates?
fit <- rpart(rate ~ DISTRICT_N + SEG_LNGTH_ + CUR_AADT + DLY_VMT, data = DT)
rpart.plot(fit)
# box plot of crash rates by lighting condition (restricted to rate < 2
# so outliers do not dominate the plot)
boxplot(rate ~ lighting, data = DT[rate < 2])
# are lighting and AADT related?
# NOTE(review): the question mentions AADT but the code uses DLY_VMT;
# the logical vector is coerced to 0/1, giving a point-biserial correlation
cor(DT$DLY_VMT, DT$lighting == "lit") # Yes - weak negative correlation
## [1] -0.1467884
hist(DT$mvmt) # VMT has an exponential distribution
hist(DT$annual_crashes) # annual crashes has similar distribution
hist(log(DT$mvmt))
Having completed our exploratory data analysis, we see that there’s a reliable relationship between crash counts and VMT. To estimate the lighting relationship, we’ll fit a regression model controlling for VMT.
Count data is typically Poisson or Negative Binomially distributed, so we’ll use an appropriate regression model.
# Reconstruct whole-number 5-year crash totals for the count models.
# round() before as.integer() matters: annual_crashes was computed as
# .N / 5, a binary floating-point value, so annual_crashes * 5 can land
# just below the true integer (e.g. 1.4 * 5 == 6.999...), and bare
# as.integer() would truncate it down by one crash.
DT[, total_crashes := as.integer(round(annual_crashes * 5))]
# fit a Poisson regression model: crash counts as a function of lighting
# condition, controlling for exposure (million VMT)
fit <- glm(total_crashes ~ lighting + mvmt, data = DT, family = poisson)
summary(fit)
##
## Call:
## glm(formula = total_crashes ~ lighting + mvmt, family = poisson,
## data = DT)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -36.333 -2.399 -1.120 0.743 29.790
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.0002022 0.0040671 491.80 <0.0000000000000002 ***
## lightingunlit -0.3263883 0.0056339 -57.93 <0.0000000000000002 ***
## mvmt 0.0395376 0.0001177 335.87 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 181931 on 15865 degrees of freedom
## Residual deviance: 128528 on 15863 degrees of freedom
## AIC: 176752
##
## Number of Fisher Scoring iterations: 5
# goodness of fit: chi-square test on the residual deviance; a p-value
# below 0.05 means we reject the hypothesis that the model fits the data
1 - pchisq(summary(fit)$deviance, summary(fit)$df.residual) # model does not fit the data (p < 0.05)
## [1] 0
# fit a negative binomial regression model -- adds a dispersion parameter
# (theta) to accommodate the overdispersion the Poisson model cannot
fit <- MASS::glm.nb(total_crashes ~ lighting + mvmt, data = DT)
summary(fit)
##
## Call:
## MASS::glm.nb(formula = total_crashes ~ lighting + mvmt, data = DT,
## init.theta = 1.132717284, link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.6615 -0.9306 -0.2972 0.3524 5.2944
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.650005 0.012225 134.97 <0.0000000000000002 ***
## lightingunlit -0.466441 0.016618 -28.07 <0.0000000000000002 ***
## mvmt 0.110142 0.001062 103.67 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(1.1327) family taken to be 1)
##
## Null deviance: 25897 on 15865 degrees of freedom
## Residual deviance: 17943 on 15863 degrees of freedom
## AIC: 92333
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 1.1327
## Std. Err.: 0.0157
##
## 2 x log-likelihood: -92324.9520
# goodness of fit: same deviance chi-square test as for the Poisson model
1 - pchisq(summary(fit)$deviance, summary(fit)$df.residual) # model does not fit the data
## [1] 0
# fit a zero-inflated count model: counts modelled by lighting + mvmt,
# zero inflation modelled by mvmt (the part after the `|`).
# NOTE(review): pscl::zeroinfl defaults to dist = "poisson", so this is a
# zero-inflated POISSON model, not negative binomial (the output below
# confirms "poisson with log link"); pass dist = "negbin" for ZINB.
fit <- pscl::zeroinfl(total_crashes ~ lighting + mvmt | mvmt, data = DT)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(fit)
##
## Call:
## pscl::zeroinfl(formula = total_crashes ~ lighting + mvmt | mvmt, data = DT)
##
## Pearson residuals:
## Min 1Q Median 3Q Max
## -26.5203 -1.1359 -0.6836 0.4483 49.9393
##
## Count model coefficients (poisson with log link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.1916951 0.0040869 536.28 <0.0000000000000002 ***
## lightingunlit -0.3935126 0.0056469 -69.69 <0.0000000000000002 ***
## mvmt 0.0370332 0.0001227 301.72 <0.0000000000000002 ***
##
## Zero-inflation model coefficients (binomial with logit link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.14101 0.03981 3.542 0.000397 ***
## mvmt -1.98016 0.06056 -32.698 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Number of iterations in BFGS optimization: 9
## Log-likelihood: -7.621e+04 on 5 Df
# what does this look like? Build a synthetic grid of segments for plotting:
# 100 mvmt values from 0.1 to 70, duplicated for lit and unlit (the
# 100-element mvmt vector is recycled across the 200 lighting rows)
synth <- data.table(lighting = c(rep("lit", 100), rep("unlit", 100)),
mvmt = seq(0.1, 70, length.out = 100))
# NOTE(review): CUR_AADT is synthesized here but the zeroinfl model uses
# only lighting + mvmt, so this column does not affect the predictions
fit_aadt <- lm(CUR_AADT ~ poly(mvmt, 3), data = DT)
synth$CUR_AADT <- predict(fit_aadt, newdata = synth)
# predict() on a zeroinfl fit defaults to type = "response": expected counts
synth$total_crashes <- predict(fit, newdata = synth)
ggplot(synth, aes(x = mvmt, y = total_crashes, color = lighting)) +
geom_line() +
ggtitle("Estimated Crashes and VMT", "By Lighting Condition")
The zero-inflated model outputs show a count-model coefficient of −0.39 for unlit segments. Because the count model uses a log link, this means unlit road segments have roughly 32% fewer crashes (exp(−0.39) ≈ 0.68) than lit segments on average, controlling for VMT. (Note that pscl::zeroinfl defaults to a zero-inflated Poisson model, not negative binomial.)
This finding is rather counter-intuitive from a “lighting improves safety” perspective, but makes sense if lighting is installed in locations with higher crash risk (e.g. intersections).